1 Setup

suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(hyperSpec))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(plotly))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(vsn))
suppressPackageStartupMessages(library(here))
source(here("UPSCb-common/src/R/plot.multidensity.R"))
source(here("UPSCb-common/src/R/featureSelection.R"))
pal <- brewer.pal(12,"Paired")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_csv("~/Git/zygoticEmbryogenesis/doc/ZE_Dataset_v3.csv",
                    col_types = cols(col_character(),
                                     col_character(),
                                     col_factor(),
                                     col_character(),
                                     col_factor(),
                                     col_character(),
                                     col_double(),
                                     col_double())) %>% 
  mutate(Tissue=factor(Tissue)) %>% 
  mutate(Time=factor(Time))

2 Analysis

2.1 Raw data

lb.filelist <- list.files(here("data/RNA-Seq/salmon"), 
                          recursive = TRUE, 
                          pattern = "quant.sf",
                          full.names = TRUE)

Filter and select only one duplicate from each sample.

lb.filterlist <- str_subset(lb.filelist,"L001_sortmerna_trimmomatic")

#removed P11562_112 trimmomatic dataset
lb.filterlist <- (str_subset(lb.filterlist, "/mnt/picea/home/mstewart/Git/zygoticEmbryogenesis/data/RNA-Seq/salmon/P11562_112_S11_L001_sortmerna_trimmomatic/quant.sf", negate = TRUE))
#removed P11562_112 row from sample list
samples <- filter(samples, !grepl("112",NGI.ID))
#samples <- filter(samples, !grepl("P11562_201",NGI.ID)) NOT REMOVING THIS SAMPLE ANYMORE, FIXED FROM FMG TO S
#lb.filterlist <- (str_subset(lb.filterlist, "P11562_201", negate = TRUE)) NOT REMOVING THIS SAMPLE ANYMORE, FIXED FROM FMG TO S

stopifnot(all(str_which(lb.filterlist, samples$NGI.ID) == 1:length(lb.filterlist)))
##assign names to the filtered filelist (which removed L002)
names(lb.filterlist) <- samples$NGI.ID
lb.filterlist <- lb.filterlist[samples$Tissue %in% c("ZE","FMG","S")] ##Are we only looking at ZE?
samples <- subset(samples, select = -c(User.ID, Sample.ID, Replicate, Mreads, X..Q30) )

Read the expression at the gene level (there is one transcript per gene)

lb.g <- suppressMessages(tximport(files = lb.filterlist, 
                                  type = "salmon",txOut=TRUE))

counts <- round(lb.g$counts)

2.2 Raw Data QC analysis

Check how many genes are never expressed - reasonable level of non-expressed genes indicated.

sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
        round(sum(sel) * 100/ nrow(counts),digits=1),
        sum(sel),
        nrow(counts))
## [1] "19.5% percent (12910) of 66069 genes are not expressed"
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>% 
         bind_cols(samples[match(names(lb.filterlist),samples$NGI.ID),]),
       aes(x,y,col=Tissue,fill=Time)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())

ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>% 
         bind_cols(samples[match(names(lb.filterlist),samples$NGI.ID),]),
       aes(x,y,col=Tissue,fill=)) + geom_col() + 
  scale_y_continuous(name="reads") +
  theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())

Display the per-gene mean expression

i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.

The cumulative gene coverage is as expected

ggplot(melt(log10(rowMeans(counts))),aes(x=value)) + 
  geom_density() + ggtitle("gene mean raw counts distribution") +
  scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 12910 rows containing non-finite values (stat_density).

The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar

dat <- as.data.frame(log10(counts)) %>% utils::stack() %>% 
  mutate(Tissue=samples$Tissue[match(ind,samples$NGI.ID)]) %>% 
  mutate(Time=samples$Time[match(ind,samples$NGI.ID)])

Color by Experiment ####OBJECT TISSUE NOT FOUND############################################################

ggplot(dat,aes(x=values,group=ind,col=Tissue)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 1767086 rows containing non-finite values (stat_density).

Color by Time

ggplot(dat,aes(x=values,group=ind,col=Time)) + 
  geom_density() + ggtitle("sample raw counts distribution") +
  scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 1767086 rows containing non-finite values (stat_density).

2.3 Export

dir.create(here("analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("analysis/salmon/ZE-unnormalised-gene-expression_data.csv"))
############## change export location name

2.4 Data normalisation

2.4.1 Preparation

For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate

dds <- DESeqDataSetFromMatrix(
  countData = counts,
  colData = samples,
  design = ~ Time - Tissue)
## converting counts to integer mode
##############################does not like *, + or /, will only take - as the model.
save(dds,file=here("analysis/salmon/ZE-Dataset-dds.rda"))

Check the size factors (i.e. the sequencing library size effect)

The sequencing depth is relatively variable (0 to 200 %) however the low end of the variance is likely due to poor samples where little sequencing data was actually produce that wasnt 16s bacterial.

dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
Table continues below
P11562_101 P11562_102 P11562_103 P11562_104 P11562_105 P11562_106
1.15 0.6652 0.6665 0.9087 0.8328 1.123
Table continues below
P11562_107 P11562_108 P11562_110 P11562_111 P11562_113 P11562_114
0.9761 0.7257 0.5128 1.581 1.415 0.6632
Table continues below
P11562_115 P11562_116 P11562_117 P11562_118 P11562_119 P11562_120
1 0.2937 1.191 0.5468 1.041 0.7201
Table continues below
P11562_121 P11562_123 P11562_124 P11562_125 P11562_126 P11562_127
1.332 1.122 0.4229 1.206 0.5732 1.553
Table continues below
P11562_128 P11562_129 P11562_130 P11562_131 P11562_133 P11562_134
1.097 1.243 1.113 1.23 1.754 1.175
Table continues below
P11562_135 P11562_136 P11562_137 P11562_138 P11562_139 P11562_140
1.565 1.276 1.076 0.8371 1.088 1.05
Table continues below
P11562_141 P11562_142 P11562_143 P11562_145 P11562_146 P11562_147
1.742 1.592 0.9982 1.499 0.908 1.003
Table continues below
P11562_148 P11562_149 P11562_150 P11562_151 P11562_153 P11562_154
1.767 1.052 1.634 1.285 0.5884 1.082
Table continues below
P11562_155 P11562_157 P11562_201 P11562_202 P11562_203 P11562_204
1.781 1.234 0.7672 1.11 1.309 1.103
P11562_205 P11562_206
1.106 0.6848
boxplot(sizes, main="Sequencing libraries size factor")

2.5 Variance Stabilising Transformation

vsd <- varianceStabilizingTransformation(dds, blind=FALSE)
vst <- assay(vsd)
vst <- vst - min(vst)
  • Validation

The variance stabilisation worked very well, the data is nearly homoscesdastic

meanSdPlot(vst[rowSums(counts)>0,])

2.6 QC on the normalised data

2.6.1 PCA

pc <- prcomp(t(vst))

percent <- round(summary(pc)$importance[2,]*100)

2.6.2 3 first dimensions

Seems that different time points form small clusters and ZE and FMG tissue types appear to separate. These are Comp1 and Comp2 which explains the different between most of the sampels except for one B4 ZE sample. Appears to be an outlier. This seems to indicate that the Tissue and Time components explain the difference between samples.

mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
              pc$x[,2],
              pc$x[,3],
              xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
              ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
              zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
              color=pal[as.integer(samples$Time)],
              pch=c(17,19,15)[as.integer(samples$Tissue)])
legend("topleft",
       fill=pal[1:nlevels(samples$Time)],
       legend=levels(samples$Time))
legend("bottomright",
       pch=c(17,19,15),
       legend=levels(samples$Tissue))

par(mar=mar)

2.6.3 2D

pc.dat <- bind_cols(PC1=pc$x[,1],
                    PC2=pc$x[,2],
                    samples)

ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Tissue)) + 
  geom_point(size=2) + 
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
  scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
  scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))

2.6.4 Interactive PCA Plot

suppressPackageStartupMessages(library(plotly))

interplot <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Tissue,text=NGI.ID)) +
  geom_point(size=2) +
  ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")

ggplotly(interplot, tooltip = "all") %>% layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
                       yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))

2.6.5 Heatmap

Filter for noise A cutoff at a VST value of 1 leaves about 32000 genes - is this adequate for the QA?

conds <- factor(paste(samples$Tissue,samples$Time))
sels <- rangeFeatureSelect(counts=vst,
                           conditions=conds,
                           nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted
## from logarithmic plot

vstCutoff <- 4+1
  • Heatmap of “all” genes Taking into account all the genes (above a noise thresholds), the samples cluster according to what we also see in the mapping rate plot, i.e. there is a correlation with the amount of sequences in the samples. It appears that generally there is little difference between the samples across all genes
  • a small difference is noticable between ZE tissues and FMG-S tissues, however generally the expression levels are relatively balanced apart from the one sample in FMG B4. Somatic samples had a small section of more highly expressed genes compared to ZE and FMG.
heatmap.2(t(scale(t(vst[sels[[vstCutoff]],]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 most variable genes
ord <- order(rowSds(vst[sels[[vstCutoff]],]),decreasing=TRUE) [1:1000]

Cluster of subset compared to overall appears to be similar in shape with a shift in increased VST expression and slightly more spread out density. These most variable genes appear to have higher levels of VST expression.

ggplot(list(sub=rowMeans(vst[sels[[vstCutoff]],][ord,]),
            total=rowMeans(vst[sels[[vstCutoff]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

Variance in expression in different tissues can be seen. Certain tissues samples appear the same as in other tissues. It appears that tissues have different expression patterns. It can also be seen that over different time points, that gene expression between the samples change slightly over time.

heatmap.2(t(scale(t(vst[sels[[vstCutoff]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

  • Heatmap of the 1000 least variable genes
ord <- order(rowSds(vst[sels[[vstCutoff]],])) [1:1000]

The subset is enriched for two subsets of genes - ones which have relatively the same level of VST expression as the total gene subset but also a much higher population of more highly VST expressed genes.

ggplot(list(sub=rowMeans(vst[sels[[vstCutoff]],][ord,]),
            total=rowMeans(vst[sels[[vstCutoff]],])) %>%  melt(),
       aes(x=value,col=L1)) + 
  geom_density() +
  ggtitle("Density of the subset vs. overall") + 
  scale_x_continuous(name=element_text("VST expression")) + 
  theme(legend.title=element_blank())

The clustering for the least variable genes shows a very small change in separation of gene expression by Tissue and Time. There is a more stark difference in the Somatic samples.

heatmap.2(t(scale(t(vst[sels[[vstCutoff]],][ord,]))),
          distfun=pearson.dist,
          hclustfun=function(X){hclust(X,method="ward.D2")},
          labRow = NA,trace = "none",
          labCol = conds,
          col=hpal)

2.7 Conclusion ###################### make separate notes for this.

The data appears to show a correlation between Tissue and Time, with some clustering in the PCA plot showing a movement of time in ascending order from right to left (+ to -) along the X axis. There can also be seen that along the Y axis, there are subgroups which correlate to the Tissue type. Along with this, it appears that overall gene expression does not appear starkly different but slight differences can be observed between Tissue types. It appears that the most difference between Tissues can be seen in the 1000 most variable genes, with some small changes between Time points within those Tissue groups. The final heat map of the 1000 least variable genes appear to show a very mixed expression pattern, similar in appearance to the total gene heat map, between Tissue groups and Time points, except for in the Somatic tissue type.

3 Session Info

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] here_0.1                    vsn_3.52.0                 
##  [3] tximport_1.12.3             forcats_0.4.0              
##  [5] stringr_1.4.0               dplyr_0.8.3                
##  [7] purrr_0.3.2                 readr_1.3.1                
##  [9] tidyr_0.8.3                 tibble_2.1.3               
## [11] tidyverse_1.2.1             scatterplot3d_0.3-41       
## [13] RColorBrewer_1.1-2          plotly_4.9.0               
## [15] pander_0.6.3                hyperSpec_0.99-20180627    
## [17] ggplot2_3.2.1               lattice_0.20-38            
## [19] gplots_3.0.1.1              DESeq2_1.24.0              
## [21] SummarizedExperiment_1.14.1 DelayedArray_0.10.0        
## [23] BiocParallel_1.18.1         matrixStats_0.54.0         
## [25] Biobase_2.44.0              GenomicRanges_1.36.0       
## [27] GenomeInfoDb_1.20.0         IRanges_2.18.2             
## [29] S4Vectors_0.22.0            BiocGenerics_0.30.0        
## [31] data.table_1.12.2          
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1       rprojroot_1.3-2        htmlTable_1.13.1      
##   [4] XVector_0.24.0         base64enc_0.1-3        rstudioapi_0.10       
##   [7] hexbin_1.27.3          affyio_1.54.0          bit64_0.9-7           
##  [10] AnnotationDbi_1.46.1   lubridate_1.7.4        xml2_1.2.2            
##  [13] splines_3.6.1          geneplotter_1.62.0     knitr_1.24            
##  [16] zeallot_0.1.0          Formula_1.2-3          jsonlite_1.6          
##  [19] broom_0.5.2            annotate_1.62.0        cluster_2.1.0         
##  [22] shiny_1.3.2            BiocManager_1.30.4     compiler_3.6.1        
##  [25] httr_1.4.1             backports_1.1.4        assertthat_0.2.1      
##  [28] Matrix_1.2-17          lazyeval_0.2.2         limma_3.40.6          
##  [31] cli_1.1.0              later_0.8.0            acepack_1.4.1         
##  [34] htmltools_0.3.6        tools_3.6.1            gtable_0.3.0          
##  [37] glue_1.3.1             GenomeInfoDbData_1.2.1 affy_1.62.0           
##  [40] reshape2_1.4.3         Rcpp_1.0.2             cellranger_1.1.0      
##  [43] vctrs_0.2.0            preprocessCore_1.46.0  gdata_2.18.0          
##  [46] nlme_3.1-141           crosstalk_1.0.0        xfun_0.9              
##  [49] testthat_2.2.1         rvest_0.3.4            mime_0.7              
##  [52] gtools_3.8.1           XML_3.98-1.20          zlibbioc_1.30.0       
##  [55] scales_1.0.0           promises_1.0.1         hms_0.5.1             
##  [58] yaml_2.2.0             memoise_1.1.0          gridExtra_2.3         
##  [61] rpart_4.1-15           latticeExtra_0.6-28    stringi_1.4.3         
##  [64] RSQLite_2.1.2          highr_0.8              genefilter_1.66.0     
##  [67] checkmate_1.9.4        caTools_1.17.1.2       rlang_0.4.0           
##  [70] pkgconfig_2.0.2        bitops_1.0-6           evaluate_0.14         
##  [73] labeling_0.3           htmlwidgets_1.3        bit_1.1-14            
##  [76] tidyselect_0.2.5       plyr_1.8.4             magrittr_1.5          
##  [79] R6_2.4.0               generics_0.0.2         Hmisc_4.2-0           
##  [82] DBI_1.0.0              pillar_1.4.2           haven_2.1.1           
##  [85] foreign_0.8-72         withr_2.1.2            survival_2.44-1.1     
##  [88] RCurl_1.95-4.12        nnet_7.3-12            modelr_0.1.5          
##  [91] crayon_1.3.4           KernSmooth_2.23-15     rmarkdown_1.15        
##  [94] locfit_1.5-9.1         readxl_1.3.1           blob_1.2.0            
##  [97] digest_0.6.20          xtable_1.8-4           httpuv_1.5.1          
## [100] munsell_0.5.0          viridisLite_0.3.0